## Explainability
#!pip install -r requirements.txt --user
#!pip install numpy
import pandas as pd
import numpy as np
import matplotlib
from helpful_util import load_models_lendingclub, display_sklearn_feature_importance, Perturb, save_obj, load_obj
import warnings
warnings.filterwarnings('ignore')
import keras
import lime
import lime.lime_tabular
%matplotlib inline
#Loading COO Matrices - sparse with OHE of all original Features
#Load UDFs to use later.
#Keras and Sklearn have different ways of showing prediction probabilities. Keras == model.predict, sklearn == model.predict_proba
def explain_lime(model, train, test, observation, seed = 1, num_features = 10):
    '''
    Build and display a LIME local explanation for one test-set observation.

    Parameters:
        model: object,
            Random Forest: rfc
            Gradient Boosted Classifier: gbc
            Logistic Regression: logit
            Keras Neural Network = keras_ann
            Sklearn Neural Network = sk_ann
        train: object; train set dataframe (unencoded rows; LIME perturbs these)
        test: object; test set dataframe
        observation: int (index of explanation on test data set)
        seed: int; default to 1
        num_features: int, the number of features to be displayed in explanation
    Returns:
        None. Renders the explanation in the notebook and prints a summary.

    NOTE(review): relies on module-level globals defined elsewhere in this
    file: encoder, feature_names, class_names, categorical_features,
    categorical_names, models, y_test.
    '''
    np.random.seed(seed)
    i = observation
    # Keras exposes probabilities via .predict(); sklearn via .predict_proba().
    # isinstance() replaces the original `type(model) == ...` comparison.
    is_keras = isinstance(model, keras.engine.sequential.Sequential)
    if is_keras:
        predict_fn = lambda x: model.predict(encoder.transform(x).toarray())
    else:
        predict_fn = lambda x: model.predict_proba(encoder.transform(x)).astype(float)
    explainer = lime.lime_tabular.LimeTabularExplainer(
        train,
        feature_names=feature_names,
        class_names=class_names,
        categorical_features=categorical_features,
        categorical_names=categorical_names,
        kernel_width=3)
    exp = explainer.explain_instance(test[i], predict_fn, num_features=num_features)
    exp.show_in_notebook(show_all=True)
    print('-' * 125)
    print('Model: {}'.format(models[str(type(model))][0]))
    print("Looking at Observation: {}".format(i))
    print("Actual Class Label: {}".format(y_test[i]))
    # Branch on the model family explicitly instead of the original bare
    # try/except, which silently swallowed unrelated errors. (Also dropped
    # the stray `%matplotlib inline` magic that sat inside the function body
    # — invalid Python outside a notebook cell, and redundant with the one
    # at the top of the file.)
    if is_keras:
        # Predictions computed once instead of twice as in the original.
        probs = model.predict(encoder.transform(test).toarray())
        print("Predicted Class Label: {}".format(np.argmax(probs, axis=1)[i]))
        print("Prediction Confidence (Prob): {0:.2f}%".format(np.max(probs[i] * 100)))
        print('-' * 125)
        print("Displaying Local Explanation for Prediction......")
    else:
        probs = model.predict_proba(encoder.transform(test[i].reshape(1, -1))).astype(float)
        print("Predicted Class Label: {}".format(np.argmax(probs)))
        print("Prediction Confidence (Prob): {0:.2f}%".format(np.max(probs[0]) * 100))
        print('-' * 125)
        print("Displaying Local Explanation for Prediction: Top {} Features......".format(num_features))
    fig = exp.as_pyplot_figure()
#Load Data Objects. Stored these locally/remotely to avoid dependency issues
# Full column inventory for the LendingClub data, in raw-data order;
# the final entry, loan_condition, is the target label.
feature_names = [
    'loan_amnt', 'term', 'int_rate', 'installment', 'grade',
    'sub_grade', 'emp_length', 'home_ownership', 'annual_inc', 'title',
    'inq_last_6mths', 'revol_bal', 'total_pymnt', 'total_rec_late_fee',
    'last_pymnt_amnt', 'acc_open_past_24mths', 'delinq_amnt', 'tax_liens',
    'tot_hi_cred_lim', 'total_bal_ex_mort', 'total_bc_limit',
    'total_il_high_credit_limit', 'loan_condition',
]
# The purely continuous subset of feature_names (same relative order).
continuous = [
    'loan_amnt', 'int_rate', 'installment', 'annual_inc',
    'inq_last_6mths', 'revol_bal', 'total_pymnt', 'total_rec_late_fee',
    'last_pymnt_amnt', 'acc_open_past_24mths', 'delinq_amnt',
    'tax_liens', 'tot_hi_cred_lim', 'total_bal_ex_mort',
    'total_bc_limit', 'total_il_high_credit_limit',
]
#Load Models
rfc, gbc, logit, keras_ann, sk_ann = load_models_lendingclub()
#Positions of the categorical columns within feature_names
# (1=term, 4=grade, 5=sub_grade, 6=emp_length, 7=home_ownership, 9=title)
categorical_features = [1, 4, 5, 6, 7, 9]
#{column index -> category labels after encoding}; LIME needs this mapping
categorical_names = load_obj('data_objects/categorical_names')
#Flatten every encoded category label into one flat list, preserving order
features = [label for labels in categorical_names.values() for label in labels]
#Prefix each run of dummy columns with the source feature it came from.
#NOTE(review): slice boundaries are hard-coded to the encoding layout —
#confirm they still match if the encoder is ever refit.
_dummy_runs = (
    (0, 2, 'Term:'),
    (2, 9, 'Loan_Grade:'),
    (9, 44, 'Loan_SubGrade:'),
    (44, 56, 'Employment_Length:'),
    (56, 60, 'Home_Ownership:'),
)
for _start, _stop, _prefix in _dummy_runs:
    features[_start:_stop] = [_prefix + lbl for lbl in features[_start:_stop]]
features[60:] = ['Loan_Title:' + lbl for lbl in features[60:]]
#Final column order = one-hot dummy columns first, then the continuous columns
features = features + continuous
#Load necessary data objects: pre-encoded feature matrices and label vectors.
X_train = load_obj('data_objects/X_train')
X_test = load_obj('data_objects/X_test')
y_train = load_obj('data_objects/y_train')
y_test = load_obj('data_objects/y_test')
#Load encoded data that models were trained on.
encoded_train = load_obj('data_objects/encoded_train')
encoded_test = load_obj('data_objects/encoded_test')
data = load_obj('data_objects/data')
encoder = load_obj('data_objects/encoder')
#Split
#Manual perturbations
#Generate a sample of the test set for feature perturbance
#NOTE(review): this reloads the same object already bound to X_test above;
#a copy of X_test would avoid the extra disk read.
X_test_holdout = load_obj('data_objects/X_test')
#NOTE(review): no seed is set immediately before this draw, so the 2000-row
#holdout can differ between runs — confirm that is intended.
idx = np.random.choice(X_test_holdout.shape[0], 2000,
replace=False) #Random 2000 samples w/o replacements
X_test_holdout = X_test_holdout[idx] #extract
X_test_holdout = pd.DataFrame(
encoder.transform(X_test_holdout).toarray(),
columns=features) #Convert to DF for column names
y_test_holdout = y_test[idx]
#Dense DataFrames of the encoded train/test sets, used later for SHAP plots
X_train_shap = pd.DataFrame(encoded_train.toarray(), columns=features)
X_test_shap = pd.DataFrame(encoder.transform(X_test).toarray(),
columns=features)
shap_values = load_obj('data_objects/shap_values') #Load Dict of Shap Values
#Map each model's class-name string to (display label, its precomputed SHAP
#values). Keyed by str(type(model)) so a model object can look itself up.
models = {}
for _mdl, _label in [(rfc, 'Random Forest'),
                     (gbc, 'Gradient Boosted Classifier'),
                     (logit, 'Logistic Regression'),
                     (sk_ann, 'Sklearn MultiLayer Perceptron'),
                     (keras_ann, 'Keras Multilayer Perceptron')]:
    _key = str(type(_mdl))
    models[_key] = (_label, shap_values[_key])
#Data dictionary for the LendingClub fields used above.
'''
annualInc	The self-reported annual income provided by the borrower during registration.
delinqAmnt	The past-due amount owed for the accounts on which the borrower is now delinquent.
empLength	Employment length in years. Possible values are between 0 and 10 where 0 means less than one year and 10 means ten or more years.
grade	LC assigned loan grade
homeOwnership	The home ownership status provided by the borrower during registration. Our values are: RENT, OWN, MORTGAGE, OTHER.
inqLast6Mths	The number of inquiries in past 6 months (excluding auto and mortgage inquiries)
installment	The monthly payment owed by the borrower if the loan originates.
intRate	Interest Rate on the loan
loanAmnt	The listed amount of the loan applied for by the borrower. If at some point in time, the credit department reduces the loan amount, then it will be reflected in this value.
revolBal	Total credit revolving balance
subGrade	LC assigned loan subgrade
tax_liens	Number of tax liens
title	The loan title provided by the borrower
tot_hi_cred_lim	Total high credit/credit limit
total_il_high_credit_limit	Total installment high credit/credit limit
totalBalExMort	Total credit balance excluding mortgage
totalBcLimit	Total bankcard high credit/credit limit
'''
#Some statistics on our models
#Some classification Reports and Confusion Matrices
import sklearn
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
class_names = ['Bad Loan', 'Good Loan']
#Test-set accuracy for any sklearn-API model (def instead of a lambda
#assignment, per PEP 8).
def get_best_score(y_true, fitted_model):
    return sklearn.metrics.accuracy_score(y_true,
                                          fitted_model.predict(encoded_test))
#Getting test score accuracy + printing
rf_best_score = get_best_score(y_test, rfc)
gbc_best_score = get_best_score(y_test, gbc)
lr_best_score = get_best_score(y_test, logit)
nn_best_score = get_best_score(y_test, sk_ann)
#Keras .predict() returns class probabilities, so argmax before scoring
ker_nn_best_score = sklearn.metrics.accuracy_score(
    y_test, np.argmax(keras_ann.predict(encoded_test.toarray()), axis=1))
print("\033[1mPrinting Test Accuracy.... \033[0m")
print("Random Forest: {0:.2f} %".format(rf_best_score * 100))
print("Gradient Boosting: {0:.2f} %".format(gbc_best_score * 100))
print("Logistic Regression: {0:.2f} %".format(lr_best_score * 100))
print("SKLEARN Neural Net: {0:.2f} %".format(nn_best_score * 100))
print("Keras Neural Net: {0:.2f} % \n \n".format(ker_nn_best_score * 100))
from pylab import rcParams
from helpful_util import plot_confusion_matrix
import matplotlib.pyplot as plt
#Printing Classification Reports and Confusion Matrices for All Models
rcParams['figure.figsize'] = 5,5 #Adjust plot size

def _report_and_confusion(title, y_pred):
    '''Print a bold header + classification report for y_pred against the
    global y_test, then plot the normalized confusion matrix.

    Replaces five copy-pasted blocks; each model's predictions are now
    computed once instead of twice (report + matrix).
    '''
    print("\033[1m {}...\n \033[0m".format(title))
    print(classification_report(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    plot_confusion_matrix(cm, classes=class_names, normalize=True)
    plt.show()

_report_and_confusion("Random Forest Classification Report",
                      rfc.predict(encoded_test))
_report_and_confusion("Gradient Boosting Classification Report",
                      gbc.predict(encoded_test))
_report_and_confusion("Logistic Regression Classification Report",
                      logit.predict(encoded_test))
_report_and_confusion("SKLEARN Neural Network Classification Report",
                      sk_ann.predict(encoded_test))
#Keras outputs probabilities; argmax converts them to class labels
_report_and_confusion(
    "KERAS Neural Network Classification Report",
    np.argmax(keras_ann.predict(encoded_test.toarray()), axis=1))
#Graphs Showing Standard Feature Importance via Sklearn API
#SciKit Learn Feature Importance in Linear Coefficient Rankings
rcParams['figure.figsize'] = 5, 5 #Adjust plot size
import seaborn as sns
'''
Logistic Regression Feature importance values are shown as the maximum potential coefficient values based on the max value of
the feature in question * the weight.
I have the raw coeff. printed in tables side by side in a block below.
'''
display_sklearn_feature_importance(data = encoded_train, set = 'lending', features = features, n_features=15)
#Fetching logit coefficients and storing locally @ obj/lendingclub/logit_coefficients.csv
# + Generating standard errors
logistic_regress_coeff = pd.DataFrame({
    "features": features,
    "Coef": logit.coef_[0]
})
#logistic_regress_coeff.to_csv('obj/lendingclub/logit_coefficients.csv')
#Densify once instead of calling .toarray() three times on the same matrix
_dense_train = encoded_train.toarray()
predProbs = logit.predict_proba(_dense_train)
#Design matrix with an intercept column of ones prepended
X_design = np.hstack([np.ones((_dense_train.shape[0], 1)), _dense_train])
#V = diag(p * (1 - p)) per observation. np.prod replaces np.product,
#a deprecated alias removed in NumPy 2.0.
V = np.diagflat(np.prod(predProbs, axis=1))
#Inverse Fisher information -> coefficient covariance matrix
covLogit = np.linalg.inv(X_design.T @ V @ X_design)
#print("Covariance matrix: ", covLogit)
# Standard errors
print("Standard errors: ", np.sqrt(np.diag(covLogit)))
from IPython.display import display_html
def display_side_by_side(*args):
    '''Render any number of DataFrames side by side in the notebook.

    Each frame's opening <table> tag gets display:inline CSS so the tables
    flow horizontally instead of stacking.
    '''
    # Join once instead of += in a loop; replace only the OPENING tag —
    # the original replace('table', ...) also mangled every closing
    # </table> into invalid markup.
    html_str = ''.join(df.to_html() for df in args)
    display_html(html_str.replace('<table', '<table style="display:inline"'),
                 raw=True)
#Split coefficients by sign, dropping exact zeros; each half sorted with the
#largest-magnitude coefficients first.
_coef = logistic_regress_coeff['Coef']
neg_coef = logistic_regress_coeff.loc[_coef < 0].sort_values('Coef', ascending=True)
pos_coef = logistic_regress_coeff.loc[_coef > 0].sort_values('Coef', ascending=False)
'''
Displaying Logit Coefficients.
Side by Side view. Left DF is negative coef, sorted by most negative first. Right is pos coefficient, sorted by most pos first
Coeff of 0 are excluded
'''
display_side_by_side(neg_coef, pos_coef)
#Exploring with Manual Feature Perturbation
#Manually perturbing features to see impact on model output
p = Perturb(X = X_test_holdout, y = y_test_holdout, data_str= 'lending') #Instantiation. Leave alone
'''
method manual_pertub takes a column and a scalar/float as input. Input column name and the amount that you would like
to perturb the specified feature by
Can only perturb continuous feats atm
'''
#Scale total_pymnt by 1.1 (i.e. +10%)
p.manual_perturb(
column='total_pymnt',
scalar=1.1)
#Scale int_rate by 1.1
p.manual_perturb(
column='int_rate',
scalar=1.1)
#Scale loan_amnt by 1.5 (+50%)
p.manual_perturb(
column='loan_amnt',
scalar=1.5)
#Showing Samples
'''
perturbation graphs.
Two modes: Accuracy & proportion.
'accuracy' shows the percentage of correct predictions on the holdout set as we iterate through a range of perturbations,
with a perturbation of 1 = no perturbation at all (scalar multiple of 1 of specified colmns).
'proportion' shows the percentage of observations classified as being of class 1 as we iterate through perturbations.
model: logit / rfc / gbc
'''
#The same per-model graph call was repeated six times; loop instead.
#Order preserved: accuracy for logit/rfc/gbc, then proportion for logit/rfc/gbc.
_graph_models = [(logit, 'Logit'),
                 (rfc, 'Random Forest'),
                 (gbc, 'Gradient Boosted Classifier')]
for _mode in ('accuracy', 'proportion'):
    for _model, _title in _graph_models:
        p.perturb_graph(model=_model,
                        mode=_mode,
                        column='int_rate',
                        title=_title)
'''
perturbation graphs continued.
This shows the same view as above, except displays all models simulataneously.
Two modes: Accuracy & proportion.
'accuracy' shows the percentage of correct predictions on the holdout set as we iterate through a range of perturbations,
with a perturbation of 1 = no perturbation at all (scalar multiple of 1 of specified colmns).
'proportion' shows the percentage of observations classified as being of class 1 as we iterate through perturbations.
model: logit / rfc / gbc
'''
#All-model consolidated graphs: accuracy then proportion, for each of three
#continuous features (order preserved from the original six calls).
for _mode in ('accuracy', 'proportion'):
    for _column in ('loan_amnt', 'annual_inc', 'int_rate'):
        p.perturb_graph_cons(mode=_mode,
                             column=_column,
                             title='All Models')
'''
Local Explanations for Lime.
Leave the train and test parameters alone, but you can adjust the model and observation parameters to see local explanations
for various observations given various models.
model: keras_ann / sk_ann / logit / rfc / gbc (object, not string)
observations: didn't try catch this, but the length of the test set is somewhere around 3-4k, so any int input within that range
I added some functionality to print some logs on the prediction confidence & accuracy.
'''
#LIME local explanation for one Keras prediction; uncomment a line below to
#explain the same/other observations with a different model.
explain_lime(model = keras_ann, train = X_train, test = X_test, observation=1000)
#explain_lime(model = logit, train = X_train, test = X_test, observation=1)
#explain_lime(model = gbc, train = X_train, test = X_test, observation=1)
#explain_lime(model = rfc, train = X_train, test = X_test, observation=1)
#explain_lime(model = sk_ann, train = X_train, test = X_test, observation=1)
#Testing
#Testing
from helpful_util import ExplainShap
import shap
'''
Local Shap Explanations
I've instantiated with the necessary elements.
plot.shap_local_graph will show you the additive shapley contribution to a model's prediction from the base value.
red = Pushing the model output higher, blue, lower.
model: keras_ann / sk_ann / logit / rfc / gbc (object, not string)
observations: didn't try catch this, but the length of the test set is somewhere around 3-4k, so any int input within that range
Note: The base value shown below is the model's probability of predicting class 1 against the column-wise median
Output Value == The probability of an observation being in class 1.
'''
shap.initjs()
plot = ExplainShap(X_train_shap, X_test_shap, models, features)
plot.shap_local_graph(model=keras_ann, observation=1000)
#plot.shap_local_graph(model=logit, observation=2000)
#plot.shap_local_graph(model=rfc, observation=2000)
#plot.shap_local_graph(model=gbc, observation=2000)
#plot.shap_local_graph(model=sk_ann, observation=2000)
#Testing
'''
plot.shap_many_graph(model) will show an interactive mapping to visualize interaction effects vs feature values
Kind of a consolidation of partial dependency plots
'''
plot.shap_many_graph(keras_ann)